import pandas as pd
import numpy as np


def _body_mass_summary(frame: pd.DataFrame, keys: list) -> pd.DataFrame:
    """Return the mean and non-null count of ``body_mass_g`` per group.

    Args:
        frame: Table containing ``body_mass_g`` and every column in *keys*.
        keys: Column names to group by.

    Returns:
        A frame with the group keys as regular columns (via ``reset_index``)
        followed by the ``mean`` and ``count`` of ``body_mass_g``.
    """
    # `observed` is passed explicitly because `sex` is categorical at the call
    # sites: relying on the implicit default is deprecated in pandas 2.x and
    # the default differs between major versions. With observed=True only
    # key combinations actually present in the data are reported.
    return (
        frame.groupby(keys, observed=True)
        .agg({'body_mass_g': ['mean', 'count']})
        .reset_index()
    )


df = pd.read_csv("../../data/pd_data/penguins.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()

# Missing-value check: number of NaN entries per column.
print(df.isna().sum())

# Drop every row that contains at least one missing value.
df.dropna(inplace=True)
print(df.head(5))

# Store `sex` as a categorical column, then show the updated dtypes.
df['sex'] = df['sex'].astype('category')
df.info()

# Bill length divided by bill depth.
# NOTE(review): 'bill_radio' looks like a typo for 'bill_ratio'; the name is
# kept because it is visible in the printed output — confirm before renaming.
df['bill_radio'] = df['bill_length_mm'] / df['bill_depth_mm']

# Binning: split body mass into three equal-width intervals and label them
# (low / medium / high).
labels = ['低', '中', '高']
df['mass_level'] = pd.cut(df['body_mass_g'], bins=3, labels=labels)
print(df['mass_level'].value_counts())

# Body-mass statistics grouped by sex.
print(_body_mass_summary(df, ['sex']))

# Body-mass statistics grouped by sex and island.
print(_body_mass_summary(df, ['sex', 'island']))

print(df.head())



